################### PROJECT- JCR REPLICATION - Data_processing_03 ##############################################
# This R file combines the raw datasets, and creates the month-level dataset for shifts in territorial control.
# Datasets generated from this codes include dataset1_month.csv and dataset1_month_staggeredadoption.csv.
# They include all variables needed for analysis in the appendices L and M.
# NOTE: To run this data processing file, 
# please run "data processing_01.R" first and get the yearly level dataset on shifts in territorial control.
# Last updated: 16th, May 2025, by Wangyin Zhao
##############################################################################################################

##############################################################################################################
# Outline of Script
# A. Preparation
#    - Set working directory
#    - Load required packages
#
# C. Data processing
#    1. Upload the yearly level dataset on the outcome of the shift of territorial control
#    2. Upload the monthly level data on rainfall-related natural disasters   
#    3. The data on rainfall-related natural disasters from EM-DAT
#    4. The data on rainfall concentration
#         
# D. Merge the data
##############################################################################################################



#### A. Preparation ####
# Set working directory to Replication_Files folder
setwd("./Replication_Files")

# Import relevant packages
packages <- c("sf", "tidyverse", "zoo")
lapply(packages, library, character.only = TRUE)
########

#### B. Data processing ####
#### 1. Upload the yearly level dataset on the outcome of the shift of territorial control ####
# Here I disregard the variable of natural disasters at the year level 
# But extract other yearly level variables such as territorial setting and the shift of territorial control 
dataset1 <- st_read("./data/analysis/dataset1_oldschool.csv") %>%
  mutate(across(everything(), ~na_if(.x, ""))) %>%
  dplyr::select(ADM3_PCODE, ADM4_PCODE, year, Tchange_inf2, Enclave_inf2, Mixed_inf2)

########

#### 2. Upload the monthly level data on rainfall-related natural disasters ####
# As the sizes are too big, I have not included the raw data for all precipitation tif. files for each grid at each month
# the precipitation data is from the IMERG: Integrated Multi-satellite Retrievals for GPM. 
# .nc or tif. files for the precipitation for each month and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading)
# https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=&endtime=&dataKeyword=IMERG%20Final (for a more access version)

# I include the data on precipitation mean at the barangay and month level after processing and the codes for the processing all spatial files (.tif) 

# Processing codes: 

# rainfallfilelist <- list.files("data/phl/", full.names = T, pattern = "GIOVANNI-*") 
# rainfallfilesdf <- data.frame(filename=rainfallfilelist, yearmonth=stringr::str_split(basename(rainfallfilelist), pattern = "_") %>% 
#                    lapply(., function(x) x[4]) %>% unlist()) %>%
#                    mutate(yearmonth = str_sub(yearmonth,start=15, end=20))

# fulldf <- tibble()
# for(i in rainfallfilesdf$yearmonth){

# tt <- Sys.time()
# print(i)

# rainfallfilename <- rainfallfilesdf$filename[rainfallfilesdf$yearmonth==i] # name of corresponding NLT file at t-1

# rainfallfile <- terra::rast(rainfallfilename) # read NLT raster

# rainfallrast <- terra::crop(rainfallfile, phl) # crop nlt raster to gfo raster
# rainfallrast <- terra::mask(rainfallrast, phl)

# tmpres <- tibble(rainfallfile=rainfallfilename,
#                  ADM4_PCODE=phl$ADM4_PCODE,
#                  yearmonth=i,
#                  rainfallmean=exact_extract(rainfallrast, phl, "mean"))

# fulldf <- bind_rows(fulldf,tmpres)
# print(Sys.time()-tt)}

# Below codes are about coding the occurence of natural disasters at the barangay-year level step by step
# First, upload the processing data for the precipitation mean at the barangay-month level and calculate the z-value
fulldf <- read.csv("./data/analysis/rainfall_zonal stat for phl.csv")  %>%
  mutate(yearmonth = as.numeric(yearmonth)) %>%
  select(ADM4_PCODE, yearmonth, rainfallmean) %>%
  mutate(
    year = as.integer(str_sub(yearmonth, 1, 4)),
    month = as.integer(str_sub(yearmonth, 5, 6))) # some data cleaning


output_f <- fulldf %>%
  group_by(ADM4_PCODE, year, month) %>%
  summarise(rainfallmean = mean(rainfallmean, na.rm = TRUE), .groups = "drop") %>%
  arrange(ADM4_PCODE, year, month) %>%
  group_by(ADM4_PCODE, month) %>%
  mutate(
    roll = rollmean(rainfallmean, k = 11, fill = NA, align = "center"),
    roll_sd = rollapply(rainfallmean, width = 11, FUN = sd, fill = NA, align = "center")
  ) %>%
  ungroup() %>%
  filter(year > 2008, year < 2015) %>%
  mutate(z_value = (rainfallmean - roll) / roll_sd) %>%
  drop_na() # this code creates the z_value for the precipitation mean for each barangay at each month 

# Second, calculate the occurence of rainfall-related natural disasters at the barangay-month level based on the output_f
nd_month <- output_f %>% 
  mutate(z_value = as.numeric(z_value),
         ND_1.5_bar = if_else(z_value >= 1.5, 1, 0),
         ND_1.6_bar = if_else(z_value >= 1.6, 1, 0),
         ND_1.7_bar = if_else(z_value >= 1.7, 1, 0),
         ND_1.8_bar = if_else(z_value >= 1.8, 1, 0)) %>% 
  rename(z_value_bar = `z_value`) %>%
  na.omit() 
#########

#### 3. The data on rainfall-related natural disasters from EM-DAT ####
# I manually coded the occurrence of natural disasters based on location column from the EM-DAT
em_dat <- read_csv("data/analysis/EM-DAT Measure.csv") %>%
  mutate(month = as.numeric(month),
         nd_em_dat = 1)

########

#### 4. The data on rainfall concentration: ####
# To calculate monthly level rainfall concentrations, 
# I use the precipitation data at the day and grid level derived from the IMERG: Integrated Multi-satellitE Retrievals for GPM.
# As the sizes are too big, I have not included the raw data for all precipitation .nc files for each grid at each day

# .nc files for the precipitation for each day and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading)

# First, I downloaded all .nc4 (NETCDF) files for every grid-day;
# Second, I conducted the conversion and clipping of a batch of .nc4 (NetCDF) files to .tif (GeoTIFF) format for spatial analysis, focusing on the Philippines.
# The second step is conducted using Bash scripts in the macOS Terminal with GDAL tools - the Command-line Bash scripts can be found in the replication file - code
# Third, I used R to loop through clipped all GeoTIFFs, extract the mean precipitation within each barangay polygon using "exactextractr", 
# and compile the results into a CSV file for zonal statistical analysis.

# below is the coding example

# input_dir <- "WHERE YOU STORE THE CONCERTED .TIF"  # Directory with raster files
# vector_file <- barangay  # Path to your vector file (barangy) 

# phl <- st_read(vector_file) # Load vector zones (as sf object for compatibility with exactextractr)

# raster_files <- list.files(input_dir, pattern = "\\.tif$", full.names = TRUE) ## List all raster files in the input directory

# zonal_results <- tibble() # Initialize an empty tibble to store results

# for (raster_file in raster_files) {
#   rainfallfilename <- basename(raster_file)
#   date_match <- gsub(".*\\.(\\d{4})(\\d{2})(\\d{2})-.*", "\\1-\\2-\\3", rainfallfilename) # Extract date (year, month, and day) from the filename
#   yearmonthday <- as.Date(date_match, format = "%Y-%m-%d")

#  rainfallrast <- rast(raster_file)

#  rainfallmean <- exact_extract(rainfallrast, phl, "mean") ## Calculate mean rainfall using exact_extract

#  tmpres <- tibble(
#    rainfallfile = rainfallfilename,
#    ADM4_PCODE = phl$ADM4_PCODE,
#    yearmonthday = yearmonthday,
#    rainfallsd = rainfallmean)

#   zonal_results <- bind_rows(zonal_results, tmpres)}

# Fourth, I calculate the monthly level rainfall concentration 
# Below are the codes example 
# zonal_results <- zonal_results %>%
#  mutate(
#    rainfallmean = as.numeric(rainfallmean),
#    year = str_sub(yearmonthday, start = 1, end = 4),
#    month = str_sub(yearmonthday, start = 6, end = 7),
#    day = str_sub(yearmonthday, start = 9, end = 10)) %>%
#  group_by(ADM4_PCODE, year, month) %>%
#  summarise(total_rainfall = sum(rainfallmean, na.rm = TRUE),
#            rci = ifelse(total_rainfall > 0,
#                 sum((rainfallmean^2), na.rm = TRUE) / (total_rainfall^2), NA))

# Finally, I created the dataset for the monthly rainfall concentration for each barangay and month 
# I rename it as "rainfall_zonal_stats_2011_2014_rainfall_concentraction" 

rainfallconcentraction <- st_read("./data/analysis/rainfall_zonal_stats_2011_2014_rainfall_concentraction.csv") %>%
  mutate(month = as.integer(month), 
         rci = as.numeric(rci)) %>%
  drop_na() %>% mutate(year = as.numeric(year))
########

#### C. Merge the data ####
dataset1_month <- dataset1 %>%
  mutate(year = as.numeric(year),
         Tchange_inf2 = as.numeric(Tchange_inf2)) %>%
  mutate(month = list(1:12)) %>% 
  unnest(cols = month) %>% 
  left_join(em_dat) %>% 
  mutate(nd_em_dat = if_else(is.na(nd_em_dat), 0, 1)) %>% 
  left_join(nd_month) %>%
  left_join(rainfallconcentraction) %>% mutate(time = paste(year, month, sep = "_"))

dataset1_month <- dataset1_month %>% mutate(wet_season = if_else(month == 5 | month == 6 | month == 7 | month == 8 | month == 9 | month == 10, 1, 0))

dataset1_month <- dataset1_month %>% dplyr::select(ADM3_PCODE, ADM4_PCODE, year, month, time, 
                                                   Tchange_inf2, Enclave_inf2, Mixed_inf2,
                                                   nd_em_dat, z_value_bar, rci, wet_season)
                                                   


write_csv(dataset1_month, "./data/analysis/dataset1_month.csv", na = "NA") 


#### D. Create the staggered adoption version 
# First. Convert 'time' to a proper Date object
dataset1_month <- dataset1_month %>%
  mutate(year = as.numeric(year),
         month = as.numeric(month),
         numeric_time = (year - 2011) * 12 + month)

# Second, code the first hit 
dataset1_month <- dataset1_month %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    first_disaster_year_month = min(numeric_time[ND_1.6_bar == 1], na.rm = TRUE),  # First disaster
    treated = ifelse(numeric_time >= first_disaster_year_month, 1, 0)  # Treated after first disaster
  ) %>%
  ungroup()

# Third, identify second disaster time
dataset1_month <- dataset1_month %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    second_disaster_time = nth(numeric_time[ND_1.6_bar == 1], 2, default = NA)) %>%
  ungroup()

# Fourth, Exclude months after the second disaster
dataset1_filtered <- dataset1_month %>%
  filter(is.na(second_disaster_time) | numeric_time <= second_disaster_time)

# Fifth, I exclude the disaster-affected barangays in 2010 and 2011. 
test <- output_f %>% filter(year == 2010 | year == 2011) %>% filter(z_value >= 1.6)
test1 <- test %>% group_by(ADM4_PCODE) %>% summarise(n=n())

dataset1_filtered_exclude_2010_2011 <- dataset1_filtered %>%
  filter(!ADM4_PCODE %in% test1$ADM4_PCODE)

dataset1_filtered_exclude_2010_2011 <- dataset1_filtered_exclude_2010_2011 %>% dplyr::select(ADM3_PCODE, ADM4_PCODE, year, month, time,
                                                                                             Tchange_inf2, Enclave_inf2, Mixed_inf2, treated,
                                                                                             nd_em_dat, z_value_bar, rci, wet_season, wet_season)

write_csv(dataset1_filtered_exclude_2010_2011, "./data/analysis/dataset1_month_staggeredadoption.csv", na = "NA") 

########

